##
## FIGURE - CLUSTERING OF REGIONS -------------------------------- 
##

rm(list=ls())

###
### PACKAGES -----------------------------------------------------
### 

library(tidyverse)
library(ggpubr)
library(cluster)    # clustering algorithms
library(factoextra) # clustering algorithms & visualization
library(psych)
library(stringr)


theme <- theme_bw() + 
  theme(plot.title = element_text(hjust = 0),
        axis.title.x=element_text(hjust = 1.0, size=11),
        axis.title.y=element_text(size=11),
        panel.grid=element_blank())

##
## LOADING DATA ----------------------------------------------------------
##

load(file="Complete DLS data file_regional level.RData")
dls <- dls_region

###
### CLUSTER --------------------------------------
###

# > inspecting adaptation strategies

summary(dls$dim1_housing_region)
summary(dls$dim2_thermal_region)
summary(dls$dim3_nutrition_region)
summary(dls$dim4_foodprep_region)
summary(dls$dim5_water_region)
summary(dls$dim6_sanitation_region)
summary(dls$dim7_health_region)
summary(dls$dim8_education_region)
summary(dls$dim9_socialconnect_region)
summary(dls$dim10_physicalconnect_region)

# > creating a separate cluster data set to determine number of clusters
clus <- dls %>% 
  group_by(country_name, region_num_raw) %>% 
  mutate(wave_max = max(wave)) %>% 
  ungroup() %>% 
  filter(wave == wave_max)

clus <- clus %>% 
  select(contains("dim"))

# > omit any cases with missing information
#clus <- na.omit(clus) 

# > fill missing values

clus <- clus %>% 
  mutate(dim1_housing_region  = ifelse(is.na(dim1_housing_region), mean(dim1_housing_region, na.rm=T), dim1_housing_region),
         dim2_thermal_region   = ifelse(is.na(dim2_thermal_region), mean(dim2_thermal_region, na.rm=T), dim2_thermal_region ),
         dim3_nutrition_region   = ifelse(is.na(dim3_nutrition_region), mean(dim3_nutrition_region, na.rm=T), dim3_nutrition_region ),
         dim4_foodprep_region   = ifelse(is.na(dim4_foodprep_region), mean(dim4_foodprep_region, na.rm=T), dim4_foodprep_region ),
         dim5_water_region   = ifelse(is.na(dim5_water_region), mean(dim5_water_region, na.rm=T), dim5_water_region ),
         dim6_sanitation_region  = ifelse(is.na(dim6_sanitation_region), mean(dim6_sanitation_region, na.rm=T), dim6_sanitation_region),
         dim7_health_region   = ifelse(is.na(dim7_health_region), mean(dim7_health_region, na.rm=T), dim7_health_region ),
         dim8_education_region   = ifelse(is.na(dim8_education_region), mean(dim8_education_region, na.rm=T), dim8_education_region ),
         dim9_socialconnect_region   = ifelse(is.na(dim9_socialconnect_region), mean(dim9_socialconnect_region, na.rm=T), dim9_socialconnect_region ),
         dim10_physicalconnect_region  = ifelse(is.na(dim10_physicalconnect_region), mean(dim10_physicalconnect_region, na.rm=T), dim10_physicalconnect_region))

summary(clus$dim1_housing_region)
summary(clus$dim2_thermal_region)
summary(clus$dim3_nutrition_region)
summary(clus$dim4_foodprep_region)
summary(clus$dim5_water_region)
summary(clus$dim6_sanitation_region)
summary(clus$dim7_health_region)
summary(clus$dim8_education_region)
summary(clus$dim9_socialconnect_region)
summary(clus$dim10_physicalconnect_region)

# > standardize key variables

clus <- scale(clus[ ,1:10])

clus <- as.data.frame(clus) %>% 
  rename("Housing"="dim1_housing_region",
         "Thermal"="dim2_thermal_region",
         "Nutrition"="dim3_nutrition_region",
         "Food"="dim4_foodprep_region",
         "Water"="dim5_water_region",
         "Sanitation"="dim6_sanitation_region",
         "Health"="dim7_health_region",
         "Education"="dim8_education_region",
         "Social"="dim9_socialconnect_region",
         "Physical"="dim10_physicalconnect_region")

# > calculate and visualize distance matrix
distance <- get_dist(clus)
#g1 <- fviz_dist(distance, gradient = list(low = "#3cc2c7", mid = "white", high = "#3cc76c"))

#ggsave(g1, filename= "figure_cluster analysis_distance mattrix between studies.png", width=11, height=11)

# > inspecting results for different numbers of clusters
set.seed(123)
k1 <- kmeans(clus, centers = 1, nstart = 25)
set.seed(123)
k2 <- kmeans(clus, centers = 2, nstart = 25)
set.seed(123)
k3 <- kmeans(clus, centers = 3, nstart = 25)
set.seed(123)
k4 <- kmeans(clus, centers = 4, nstart = 25)
set.seed(123)
k5 <- kmeans(clus, centers = 5, nstart = 25)
set.seed(123)
k6 <- kmeans(clus, centers = 6, nstart = 25)
set.seed(123)
k7 <- kmeans(clus, centers = 7, nstart = 25)
set.seed(123)
k8 <- kmeans(clus, centers = 8, nstart = 25)
set.seed(123)
k9 <- kmeans(clus, centers = 9, nstart = 25)

# plots to compare
p1 <- fviz_cluster(k1, geom = "point", data = clus) + ggtitle("Number of clusters = 1")+theme
p2 <- fviz_cluster(k2, geom = "point", data = clus) + ggtitle("Number of clusters = 2")+theme
p3 <- fviz_cluster(k3, geom = "point",  data = clus) + ggtitle("Number of clusters = 3")+theme
p4 <- fviz_cluster(k4, geom = "point",  data = clus) + ggtitle("Number of clusters = 4")+theme
p5 <- fviz_cluster(k5, geom = "point",  data = clus) + ggtitle("Number of clusters = 5")+theme
p6 <- fviz_cluster(k6, geom = "point",  data = clus) + ggtitle("Number of clusters = 6")+theme
p7 <- fviz_cluster(k7, geom = "point",  data = clus) + ggtitle("Number of clusters = 7")+theme
p8 <- fviz_cluster(k8, geom = "point",  data = clus) + ggtitle("Number of clusters = 8")+theme
p9 <- fviz_cluster(k9, geom = "point",  data = clus) + ggtitle("Number of clusters = 9")+theme

p19 <- ggarrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, labels=c("A", "B", "C", "D", "E", "F", "G", "H", "I"))
p19

ggsave(plot = p19, filename= "supplementary figure s6.png", width=10, height=7)

# > what is the optimal number of clusters

set.seed(123)

g1 <- fviz_nbclust(clus, kmeans, method = "wss") +
  geom_vline(xintercept = 4, color = "red")+
  theme +
  ggtitle("")+
  theme(plot.caption = element_text(hjust = 0, face= "italic"), #Default is hjust=1
        plot.title.position= "plot")

g2 <- fviz_nbclust(clus, kmeans, method = "silhouette") +
  geom_vline(xintercept = 4, color = "red")+
  theme +
  ggtitle("")+
  theme(plot.caption = element_text(hjust = 0, face= "italic") ,
        plot.title.position = "plot")

g12 <- ggarrange(g1, g2, labels=c("A", "B"))
g12

ggsave(plot = g12, file="supplementary figure s7.png", width=10, height=4)

# > Final
set.seed(123)
final <- kmeans(clus, 4, nstart = 123)
print(final)

g1 <- fviz_cluster(final, data = clus, labelsize=4) +
  theme+
  ggtitle("Cluster plot by first principal components")+
  theme(plot.caption = element_text(hjust = 0, face= "italic"),  
        plot.title.position = "plot")

g1

clus <- as.data.frame(clus) %>%
  mutate(cluster = final$cluster)  

# > creating a separate cluster data set to determine number of clusters
dls_clus <- dls %>% 
  select(SurveyId, country_name, region_num_raw, wave, contains("dim")) %>% 
  rename("country_name"="country_name") %>% 
  group_by(country_name, region_num_raw) %>% 
  mutate(wave_max = max(wave)) %>% 
  ungroup() %>% 
  filter(wave == wave_max)

dls_clus <- cbind(dls_clus, clus)
dls.check <- dls_clus %>% filter(country_name == "India")

describeBy(dls_clus, dls_clus$cluster)

dls_clus <- dls_clus %>% 
  mutate(order = recode(cluster,
                          "1" = 1L, 
                          "2" = 3L,
                          "3" = 2L,
                          "4" = 4L)) %>% 
  mutate(cluster = recode(cluster, 
                          "3" = "B Moderate to high deprivation cluster (n=195)", 
                          "2" = "C Low to moderate deprivation cluster (n=312)", 
                          "4" = "D Low deprivation cluster (n=266)",
                          "1" = "A High deprivation cluster (n=330)"))
table(dls_clus$cluster)

dls_clus <- dls_clus %>% mutate(cluster=as.factor(cluster))

load("supplementary figure s5a.RData")

g2 <- dls_clus %>% 
  select(Housing:Physical, cluster) %>% 
  gather(Housing:Physical, key=dimension, value=sdeviation) %>% 
  ggplot()+
  geom_boxplot(mapping=aes(x=dimension, y=sdeviation), 
               fill="#e6f0ef",alpha=0.6, outlier.shape = NA) +
  geom_hline(mapping=aes(yintercept=0), color="Red", linetype="dashed")+
  facet_wrap(vars(cluster), nrow=4)+
  coord_cartesian(ylim=c(-3,3))+
  theme(axis.text.x = element_text(angle=90, hjust=1))+
  theme_bw()+
  xlab("")+ylab("Standardized deviations from global average")+
  ggtitle("")

g2

#ggsave(plot=g2, file="supplementary figure s5b.png")


##
## CLUSTER MAP
## 

library(sf) 
library(countrycode)

table(dls_clus$cluster, useNA = "always")

#> keeping only cases in the data for the last available year
aux1 <- dls_clus %>% 
  #filter(year == year_max) %>% 
  mutate(cluster = recode(
    cluster,
    "A High deprivation cluster (n=330)" = "High",
    "B Moderate to high deprivation cluster (n=195)"="Moderate to high",
    "C Low to moderate deprivation cluster (n=312)"="Low to moderate",
    "D Low deprivation cluster (n=266)"="Low"
  ))

table(aux1$cluster, useNA = "always")

map.world <- map_data('world') %>% 
  rename("Country"="region")  

geo_shp <- read_sf("dhs_shapefile.gpkg")
class(geo_shp)

#> GEOLEVEL is the main identfier. Here I make sure it's numeric
geo_shp <- geo_shp %>% 
  left_join(aux1, by=c("CountryName" = "country_name",
                       "RegionId" = "region_num_raw"))
geo_shp <- geo_shp %>% filter(CountryName != "Paraguay")
table(geo_shp$cluster, useNA = "always")

d.check <- geo_shp %>% filter(is.na(cluster))

g3 <- ggplot() +
  geom_polygon(data=map.world, 
               aes( x = long, y = lat, group = group), 
               fill="#dedede")+
  geom_sf(data=geo_shp, 
          aes(fill=as.factor(cluster), 
              size = NA))+
  theme_light()+
  theme(axis.title = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        panel.border = element_blank(),
        legend.position="bottom")+
  coord_sf(ylim = c(-55,80),xlim=c(-150,160))+
  scale_fill_manual(name="Clusters with different degrees of deprivation",
                    values=c( "#ba0700", "#e6352e", "#f0918d", "#f7dddc"))+
  ggtitle("")+
  guides(fill = guide_legend(title.position = "top", title.hjust=0.5,
                           nrow=1, 
                           label.position = "bottom",
                           label.theme = element_text(size=8),
                           keywidth = 5)) 
                    
#ggsave(plot=g3, file="figure_map with cluster locations.png")

g123 <- ggarrange(
  ggarrange(g1,g2, labels=c("A", "B"), ncol=2, nrow=1),
  ggarrange(g3, labels = c("C")),
  nrow=2, ncol=1,
  heights=c(1,1.4),
  widths=c(1,1),
  font.label=list(size=18)
)

ggsave(plot=g123, file="supplementary figure s5.png", width = 11, height=10)
ggsave(plot=g123, file="supplementary figure s5.svg", width = 11, height=10)

